************************************
*********** PROJECT INFO ***********
************************************

// Record Linkage for Character-Based Names
// Clean 1900 names
// Author: Hannah Postel
// Date: 10/26/2022

***************************************
********* BASIC NAME CLEANING *********
***************************************

// 1900 census data downloaded from NBER
// Males born in China, Chinese race, 48 contiguous states only
// N=76,484

// Load the 1900 extract; the folder is taken from the $data global
use "$data/1900_chinese.dta", clear

*****************************************
************** CLEAN NAMES **************
*****************************************

*** SPECIFY MISSING NAMES ***
// Blank and "UNKNOWN" entries are both recoded to the "." placeholder
// First names: N=1,636
replace namefrst = "." if inlist(namefrst, "", "UNKNOWN")
// Last names: N=6,388
replace namelast = "." if inlist(namelast, "", "UNKNOWN")

*** DROP THOSE WITH BOTH NAMES MISSING ***
// Not done here, to stay comparable with other matching methods, which do NOT drop missing names
// Recommend including this step in standalone analyses
// drop if namelast=="." & namefrst=="." // N=252

*** MANUAL CLEANING ***

// Mostly checking names with embedded specials (*, ?) with enumeration forms
// 1900-specific problem only
//
// Every rule keys on the RAW transcribed value, so order matters: a rule that
// tests one name field must run before another rule rewrites that field.
// Names judged illegible are cut back to the legible prefix, or set to the
// "." missing-name code used in the SPECIFY MISSING NAMES step.

// "First" names
replace namefrst="CHARLIE AH" if namefrst=="CH**LU AH"
replace namefrst="C" if namefrst=="CH*ATH" // illegible
replace namefrst="CHANG" if namefrst=="CH*G"
replace namefrst="CHARLEY" if namefrst=="CHI*KEY"
replace namefrst="CHIN MAU" if namefrst=="CHIN M*R"
replace namefrst="FOOK" if namefrst=="FR*K"
replace namefrst="GUNG WO" if namefrst=="G* W"
replace namefrst="GAM" if namefrst=="G*M"
replace namefrst="QUON" if namefrst=="G*ON"
replace namefrst="AH GEST" if namefrst=="GE* AH"
replace namefrst="DOCK WO" if namefrst=="H* LEE"
// The last-name fix for the "H*N" record identifies it by its raw first name,
// so it has to run here, before the next line rewrites "H*N" to "H".
// (It previously sat in the last-name section, where it could never match.)
replace namelast="SING" if namelast=="LING" & namefrst=="H*N"
replace namefrst="H" if namefrst=="H*N"
replace namefrst="HIM JONG" if namefrst=="HIM JO*G"
replace namefrst="CHEOY" if namefrst=="I*OY"
replace namefrst="JIAN" if namefrst=="J*EN"
replace namefrst="JING" if namefrst=="J*G"
drop if namefrst=="J*LE" // wrongly entered as chinese
replace namefrst="J" if namefrst=="JR*G"
replace namefrst="HAP JUNK" if namefrst=="JUNK *AP"
replace namefrst="KUNG" if namefrst=="K*G"
replace namefrst="HEM LUNG" if namefrst=="KIM T*G"
replace namefrst="KNOW" if namefrst=="KN*W"
replace namefrst="L" if namefrst=="L O*RY"
replace namefrst="H" if namefrst=="L* H"
replace namefrst="LUNG" if namefrst=="L*G"
replace namefrst="LOUIE" if namefrst=="L*IE"
replace namefrst="LEE" if namefrst=="LEE (ONG-SU-PO*)"
replace namefrst="LEE JOE" if namefrst=="LEE *OE"
replace namefrst="LY F" if namefrst=="LY* F" // illegible
replace namefrst="IMG MA" if namefrst=="MA *MG"
replace namefrst="MAU C" if namefrst=="MA* C"
replace namefrst="MOON MING" if namefrst=="MO*N MING"
replace namefrst="NOON" if namefrst=="N*N"
replace namefrst="O C" if namefrst=="O* C" // illegible
replace namefrst="CHARLEY" if namefrst=="P*LEY"
replace namefrst="POY HUP" if namefrst=="POY H*P"
replace namefrst="QUING L" if namefrst=="QU* L"
// The age correction must precede the name rewrite: it finds the record by
// its raw first name
replace age=46 if namefrst=="QU*K" // birth year 1854
replace namefrst="." if namefrst=="QU*K" // illegible; "." is the missing-name code
replace namefrst="SING KEE" if namefrst=="S* KEE"
replace namefrst="SING" if namefrst=="S*G"
replace namefrst="SAM LEE" if namefrst=="SA* LEE"
replace namefrst="GONG" if namefrst=="SCHE* HONG"
replace namefrst="SHEN TUCK" if namefrst=="SH* TUCK"
replace namefrst="SHUCK" if namefrst=="SH*K"
replace namefrst="SHUN U" if namefrst=="SHUN U*NG"
replace namefrst="SHU" if namefrst=="SHUU F*G"
replace namefrst="SI LEONG" if namefrst=="SI *OUG"
replace namefrst="SING HOP" if namefrst=="SIN*Y"
replace namefrst="TH" if namefrst=="TH*N"
replace namefrst="WO TOM" if namefrst=="TOM * WO"
replace namefrst="SUNG" if namefrst=="V*G"
replace namefrst="W" if namefrst=="W**EE"
// Same raw first name, two different people: told apart by last name
replace namefrst="WUNG" if namefrst=="W*G" & namelast=="DAN"
replace namefrst="WONG" if namefrst=="W*G" & namelast=="YUR"
replace namefrst="W" if namefrst=="W*N"
replace namefrst="W" if namefrst=="W*O" // illegible
replace namefrst="YING" if namefrst=="Y*G"
replace namefrst="YEE G" if namefrst=="YEE G*CH"
replace namefrst="BUNG YE YIM" if namefrst=="YUM B*** YE"
replace namefrst="GIT" if namefrst=="GIT ALIAS HENRY"
// First-name fixes located via the RAW last name; these must stay above the
// last-name section, which rewrites S*TO, W*OY and CL*G
replace namefrst="SUTO" if namelast=="S*TO"
replace namefrst="FOY" if namelast=="W*OY"
replace namefrst="AH" if namelast=="CL*G"
replace namefrst="MHDOS" if namefrst=="M H D O S"
// "X OR Y" / alias readings: keep a single reading
replace namefrst="LING" if namefrst=="SING WAN OR LING"
replace namefrst="WOO" if namefrst=="WOO OR NEW YORK"
replace namefrst="AH CHING" if namefrst=="AH CHING OR JACK"
replace namefrst="WY" if namefrst=="WY OR WY LEE"
replace namefrst="LOY" if namefrst=="L8Y"

// "Last" names
// (The LING -> SING fix for the "H*N" record is applied in the first-name
// rules above, where the raw first name is still available.)
replace namelast="QUING" if namelast=="C*AH"
replace namelast="CHONG" if namelast=="CH*G"
replace namelast="CHUNG" if namelast=="CH*NG"
replace namelast="CHANG" if namelast=="CL*G"
replace namelast="." if namelast=="D*G" // NOT FOUND IN ANCESTRY; will get dropped as 1-part anyway
replace namelast="FONG" if namelast=="F*G"
replace namelast="KING" if namelast=="H*G"
replace namelast="JIM" if namelast=="J*N"
replace namelast="KING" if namelast=="K*G"
replace namelast="LUNG" if namelast=="L*G"
replace namelast="L" if namelast=="LA*ING" // illegible
replace namelast="LOMVEII" if namelast=="LO*I"
replace namelast="NUNG" if namelast=="N**G"
replace namelast="R" if namelast=="R*G" // illegible
replace namelast="QUONG" if namelast=="R*Y"
replace namelast="SING" if namelast=="S**G"
replace namelast="SUE" if namelast=="S*TO"
replace namelast="TUNG" if namelast=="T*Y"
replace namelast="U" if namelast=="U*NG" // illegible
replace namelast="W" if namelast=="W*OY"
replace namelast="QUONG" if namelast=="ZU*NG"
replace namelast="LU" if namelast=="LU OR LEE"
replace namelast="SANG" if namelast=="SANG OR SONG"

*** REPLACE SPECIALS WITH SPACES/REMOVE ***
foreach v in namefrst namelast {
    // A hyphen joins two fragments, so it turns into a space
    replace `v' = subinstr(`v', "-", " ", .)
    // All other punctuation is simply deleted
    replace `v' = subinstr(`v', "(", "", .)
    replace `v' = subinstr(`v', ")", "", .)
    replace `v' = subinstr(`v', "?", "", .)
    replace `v' = subinstr(`v', "*", "", .)
    replace `v' = subinstr(`v', ",", "", .)
    replace `v' = subinstr(`v', "'", "", .)
}
// "#" is removed from first names only
replace namefrst = subinstr(namefrst, "#", "", .)
// Periods are handled last; a name that is exactly "." is the missing-name
// placeholder and is left alone
foreach v in namefrst namelast {
    replace `v' = subinstr(`v', ".", "", .) if `v' != "."
}

*** TRIM ADDITIONAL SPACES ***
// Collapse runs of internal blanks first, then drop leading/trailing blanks
foreach v in namefrst namelast {
    replace `v' = strtrim(stritrim(`v'))
}

*** FORMATTING "AH" HONORIFIC ***
// Three substring passes, applied in this order:
//   1. "A H"  -> "AH"   (re-join a split honorific)
//   2. "A "   -> "AH "  (expand a bare "A")
//   3. "AAH " -> "AH "  (undo the doubled A that pass 2 makes out of "AA ")
// NOTE(review): subinstr() matches anywhere in the string, so passes 1-2 also
// touch any fragment that merely ends in "A" (the later "CHINAH GIB" fix is
// one visible consequence) -- confirm this is intended.
replace namefrst = subinstr(subinstr(subinstr(namefrst, "A H", "AH", .), "A ", "AH ", .), "AAH ", "AH ", .)
// A first name that is nothing but "A" or "AA" is the honorific itself
replace namefrst = "AH" if inlist(namefrst, "A", "AA")

*** DROP NON-REAL NAMES ***
// "China" as a name fragment usually comes with at most one other fragment,
// i.e. it is a label rather than a name.
// Two records do carry a usable name; rescue them before the drop.
replace namefrst = "GIB" if namefrst == "CHINAH GIB"
replace namefrst = "LEW" if namefrst == "LEW CHINAMAN"

// Remove every remaining record with "CHINA" anywhere in either name field
// (13 obs via the last name, then 22 obs via the first name)
drop if strpos(namelast, "CHINA") | strpos(namefrst, "CHINA")

*** STANDARDIZE AMERICANIZED SPELLINGS ***
// Variant spellings are mapped onto one canonical form per name.
// NOTE(review): periods were already stripped from both name fields above, so
// the "CHAS." alternative looks unreachable; also the first-name list has
// "CHARLI" where the last-name list has "CHARLIS" -- confirm both are intended.
replace namefrst = "CHARLIE" if inlist(namefrst, "CHARLEY", "CHARLY", "CHARLI")
replace namefrst = "CHARLES" if inlist(namefrst, "CHAS.", "CHAS")
replace namelast = "CHARLIE" if inlist(namelast, "CHARLEY", "CHARLY", "CHARLIS")

**** LOOK THROUGH AGAIN FOR PEOPLE WHO SHOULD HAVE SPACES BUT DON'T OR VICE VERSA ****

// clean the unclear readings
// "a or b" formatting
/* CORO OR COW
FOOK OR TOOK
MUEN OR MUEU
DEONG OR SEONG
JUNG OR LUNG
MEDUE OR MEDUC
YING OR ZING
JEU OR JEN
CHEN OR CHEW
YOU OR YOW */

// REVIEW FOR POSSIBLE FIVE-PART NAMES

/* CHEW SAM SING | WOO WEE
CHING CHIN | BOW CHING CHU
LEE YOW | KIN LEE YOW */


// This is the "light" clean for the baseline match
// N=76,448
// Saved before segmentation: names recoded in the SPECIFY MISSING NAMES step
// still carry the "." placeholder here, and none of the fragment variables
// created further down exist yet. Overwrites any earlier copy of the file.
save "$data/1900_chn_clean.dta", replace


*****************************************
************** SEGMENTATION *************
*****************************************

*** PREPARING FOR SEGMENTATION ***

// 0/1 flags: does the name contain a space (more than one fragment)?
gen namefrst_mult = (strpos(namefrst, " ") > 0)
gen namelast_mult = (strpos(namelast, " ") > 0)

// From this point a missing name is an empty string, not "."
foreach v in namefrst namelast {
    replace `v' = "" if `v' == "."
}

// For each name field:
//   nspace_* = number of spaces in the name
//   nchar_*  = number of fragments (spaces + 1), or 0 for an empty name
foreach part in frst last {
    gen nspace_`part' = length(name`part') - length(subinstr(name`part', " ", "", .))
    gen nchar_`part' = cond(name`part' != "", nspace_`part' + 1, 0)
}

// Totals over both name fields
gen nspace_tot = nspace_frst + nspace_last
gen nchar_tot = nchar_frst + nchar_last

*** SEGMENTING "FIRST" NAMES ***

// One variable per space-delimited fragment: name1, name2, ...
split namefrst, gen(name)

// Two-character surnames: when the two leading fragments spell one of the
// listed variants, fuse them into a single token held in name1
replace name1 = "SOOHOO" if (name1 == "SOO" & inlist(name2, "HOO", "HO")) | (name1 == "SEE" & name2 == "HOO")
replace name1 = "OWYANG" if (name1 == "OW" & inlist(name2, "YOUNG", "YANG")) | (name1 == "OU" & name2 == "YONG")
// Then move the following fragments one slot to the left.
// NOTE(review): when there is no third fragment, name2 keeps the second half
// of the surname; name4 is not blanked after being copied into name3; and a
// record whose name1 already read SOOHOO/OWYANG before this step is shifted
// too. Later steps patch specific records by hand -- confirm before changing.
replace name2 = name3 if inlist(name1, "SOOHOO", "OWYANG") & name3 != ""
replace name3 = name4 if inlist(name1, "SOOHOO", "OWYANG")

// Records without a "last" name: mark every unused fragment slot with "."
foreach k in 2 3 4 {
    replace name`k' = "." if namelast == "" & name`k' == ""
}

*** SEGMENTING "LAST" NAMES ***
// One variable per space-delimited fragment: othername1, othername2, ...
split namelast, gen(othername)
order namefrst name1-name4 namelast othername1-othername3

// Two-character surnames: fuse the listed spelling variants into one token
replace othername1 = "SOOHOO" if ///
    (othername1 == "SOO" & inlist(othername2, "HOO", "HO", "HU")) | ///
    (othername1 == "SU"  & inlist(othername2, "HO", "HOO", "HU")) | ///
    (othername1 == "SEE" & inlist(othername2, "TOU", "TOW", "HOU", "HO"))

replace othername1 = "OWYANG" if ///
    (inlist(othername1, "OW", "OY") & othername2 == "YOUNG") | ///
    (othername1 == "OU" & othername2 == "YUNG")

// After a merge the third fragment (possibly empty) moves into slot 2 and
// slot 3 is emptied. A single unconditional copy is enough: it also covers
// the case where othername3 is empty.
replace othername2 = othername3 if inlist(othername1, "SOOHOO", "OWYANG")
replace othername3 = "" if inlist(othername1, "SOOHOO", "OWYANG")

*** REASSIGN NAME FRAGMENTS TO NEW VARS ***
// Merges the "first"-name fragments (name1-name4) and the "last"-name
// fragments (othername1-othername3) into one left-to-right sequence stored in
// name1-name4, writing "." into slots that end up unused.
// The steps are order-dependent: most replaces only fill a slot that is still
// empty, so an earlier step decides whether a later one fires.
// nchar_frst / nchar_tot were computed before the SOOHOO/OWYANG merges, so
// they still count a two-character surname as two fragments.

// Filling in blanks
// Empty name1 (no "first" name): take the first "last"-name fragment
replace name1=othername1 if name1=="" & othername1!=""

// 1-fragment name in both locations
// (no spaces anywhere and two fragments in total)
replace name2=othername1 if nspace_tot==0 & nchar_tot==2

// Blank "first" names
// The remaining "last"-name fragments slide into name2/name3
replace name2=othername2 if name2=="" & namefrst=="" & othername2!=""
replace name3=othername3 if name3=="" & namefrst=="" & othername3!=""

// Fill in missing values
// Slots beyond the total fragment count are marked unused
replace name3="." if nchar_tot<3
replace name4="." if nchar_tot<4

// 3-fragment names
// 2-fragment "first" + 1-fragment "last"
replace name3=othername1 if name3=="" & nchar_frst==2 & nchar_tot==3
// 1-fragment "first" + 2-fragment "last" (fragment 1)
replace name2=othername1 if name2=="" & nchar_frst==1 & nchar_tot==3
// 1-fragment "first" + 2-fragment "last" (fragment 2)
replace name3=othername2 if name3=="" & nchar_frst==1 & nchar_tot==3
// A merged two-character surname in name2 leaves name3 still empty: mark it unused
replace name3="." if name3=="" & (name2=="SOOHOO" | name2=="OWYANG")

// 4-fragment names
// 2-fragment + 2-fragment (fragment 1)
replace name3=othername1 if name3=="" & nchar_frst==2 & nchar_tot==4
// 2-fragment + 2-fragment (fragment 2)
replace name4=othername2 if name4=="" & nchar_frst==2 & nchar_tot==4
// 3-fragment + 1-fragment
replace name4=othername1 if name4=="" & nchar_frst==3 & nchar_tot==4
// 1-fragment + 3-fragment
replace name2=othername1 if name2=="" & nchar_frst==1 & nchar_tot==4
replace name3=othername2 if name3=="" & nchar_frst==1 & nchar_tot==4
replace name4=othername3 if name4=="" & nchar_frst==1 & nchar_tot==4
// A merged surname in name3 forces name4 to unused, whatever it held
replace name4="." if (name3=="SOOHOO" | name3=="OWYANG")

// Formatting
// Take out periods from longer names
// NOTE(review): this also turns a bare "." placeholder in name1/name2 into an
// empty string; the last two lines of this section put the "." back.
replace name1 = subinstr(name1, ".", "",.)
replace name2 = subinstr(name2, ".", "",.)  

// Final cleaning
replace name1="AH" if name1=="A"
// NOTE(review): name2 cannot equal "." at this point (periods were just
// stripped from it), so the next line appears never to fire -- confirm
// whether the period removal was meant to come after it.
replace name2=name4 if name2=="." & name3=="."
// Clear name4 where it merely repeats name2 and name3 is unused
replace name4="." if name2==name4 & name3=="."
// Record-specific fixes, identified by the full source name
replace name4="." if namefrst=="CHAN HOY WAN" | namefrst=="LE WAY FONG" | namefrst=="MALS LING CHING"
replace name2="YEE" if namelast=="SOO HOO YEE"
replace name3="SOOHOO" if namelast=="SOO HOO YEE"
replace name3="WEE" if namefrst=="SOO HOO YEE" & namelast=="WEE"
replace name4="." if namefrst=="SOO HOO YEE" & namelast=="WEE"
// Any slot still empty in name1/name2 gets the "." placeholder
replace name2="." if name2==""
replace name1="." if name1==""

// Column layout: segmented fragments first, then the two source name fields,
// then the "last"-name fragments; rows sorted by the fragment sequence
order name1-name4 namefrst namelast othername1-othername3
sort name1 name2 name3 name4 

// Save final segmented dataset
// (overwrites any earlier copy of the file)
save "$data/1900_chn_finalnames.dta", replace
